########################################
# This file contains the code to run 
# structural topic models on the Lucid
# data
##########################################

##############################
#Packages
##############################
library(rio)
library(tidyverse)
library(stm)
library(gridExtra)
library(stringr)
library(tidytext)
library(stmprinter)
library(stmBrowser)

##############################
#Loading and Cleaning Data
##############################

####The Data
# Coded Lucid survey responses, read with rio::import().
lucid <- import(file = "./Data/lucid_coded_all.Rda")

####Separating by Party Target and Cleaning Data
# Rows with rep_extreme == 1, restricted to the respondent id, the open-ended
# text about the Republican Party, and the covariates used in the models below.
reps <- lucid %>%
  filter(rep_extreme == 1) %>%
  select(id, rep_extreme_text, pid_3, op_ideol, knowl_mean,
         race_eth, gender, age_cat, dem_extreme_1)

# Replace the numeric codes with labelled factors (columns are overwritten in
# place, so the column order is unchanged).
reps <- reps %>%
  mutate(
    race_eth = factor(
      race_eth,
      levels = 1:5,
      labels = c("White", "Black", "Hispanic", "Asian", "Other")
    ),
    pid_3 = factor(
      pid_3,
      levels = 1:3,
      labels = c("Democrat", "Republican", "Independent")
    ),
    age_cat = factor(
      age_cat,
      levels = 1:6,
      labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")
    ),
    dem_extreme_1 = factor(
      dem_extreme_1,
      levels = 0:1,
      labels = c("Dems Not Too Extreme", "Dems Too Extreme")
    )
  )

# Rows with dem_extreme == 1, restricted to the respondent id, the open-ended
# text about the Democratic Party, and the covariates used in the models below.
dems <- lucid %>%
  filter(dem_extreme == 1) %>%
  select(id, dem_extreme_text, pid_3, op_ideol, knowl_mean,
         race_eth, gender, age_cat, rep_extreme_1)

# Replace the numeric codes with labelled factors (columns are overwritten in
# place, so the column order is unchanged).
dems <- dems %>%
  mutate(
    race_eth = factor(
      race_eth,
      levels = 1:5,
      labels = c("White", "Black", "Hispanic", "Asian", "Other")
    ),
    pid_3 = factor(
      pid_3,
      levels = 1:3,
      labels = c("Democrat", "Republican", "Independent")
    ),
    age_cat = factor(
      age_cat,
      levels = 1:6,
      labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")
    ),
    rep_extreme_1 = factor(
      rep_extreme_1,
      levels = 0:1,
      labels = c("Reps Not Too Extreme", "Reps Too Extreme")
    )
  )

#####Restricting to complete cases; needed for STM
# Drop every row that is missing the response text or any model covariate.
reps1 <- reps %>%
  drop_na(rep_extreme_text, dem_extreme_1, pid_3, op_ideol,
          knowl_mean, race_eth, gender, age_cat)

# Same listwise deletion for the Democratic-target responses.
dems1 <- dems %>%
  drop_na(dem_extreme_text, rep_extreme_1, pid_3, op_ideol,
          knowl_mean, race_eth, gender, age_cat)

# Row counts before and after listwise deletion. Printed explicitly because
# bare top-level expressions produce no output when the script is source()'d.
print(nrow(reps))
print(nrow(reps1))
print(nrow(dems))
print(nrow(dems1))

# Class of the two analysis data sets.
print(class(reps1))
print(class(dems1))

##############################
#Preparing the Data
##############################
####First steps
# We want to remove words that directly reference the target of the question,
# and the word "party" as well, since we don't feel these are particularly
# meaningful (much like "Sherlock" isn't in text analyses of a Sherlock Holmes
# novel). To do this, we use the tidytext package to identify the words we
# wish to remove so that they can be entered as custom stop words.

###Republicans
# One row per token of each response; tokens containing digits are dropped.
rep_df <- reps1 %>%
  unnest_tokens(word, rep_extreme_text) %>%
  filter(!str_detect(word, "[0-9]+"))

# Token frequencies, computed once and reused for the export and the
# stop-word candidates below.
rep_count <- rep_df %>%
  count(word)

export(rep_count, "rep_count.csv")

# Candidate custom stop words: tokens containing "repub", "part" or "extr".
# The flags are logical, so they are used directly in filter().
rep_custom <- rep_count %>%
  mutate(repub = grepl("repub", word),
         party = grepl("part", word),
         extreme = grepl("extr", word)) %>%
  filter(repub | party | extreme)

# Responses that mention "extr..." for manual inspection. The match is
# case-insensitive so that it agrees with the token-level pass above
# (unnest_tokens() lower-cases by default); a case-sensitive match would
# silently skip responses such as "Extreme ...".
reps1 %>%
  filter(grepl("extr", rep_extreme_text, ignore.case = TRUE)) %>%
  select(id, rep_extreme_text) %>%
  export(., "rep_extreme_extreme.csv")

#Democrats
# One row per token of each response; tokens containing digits are dropped.
dem_df <- dems1 %>%
  unnest_tokens(word, dem_extreme_text) %>%
  filter(!str_detect(word, "[0-9]+"))

# Token frequencies, computed once and reused for the export and the
# stop-word candidates below.
dem_count <- dem_df %>%
  count(word)

export(dem_count, "dem_count.csv")

# Candidate custom stop words: tokens containing "dem" or "part".
# The flags are logical, so they are used directly in filter().
dem_custom <- dem_count %>%
  mutate(democrat = grepl("dem", word),
         party = grepl("part", word)) %>%
  filter(democrat | party)

####Republicans
# Pre-process the responses with stm::textProcessor(), passing the
# target-referencing words identified above as custom stop words.
processed_R <- textProcessor(
  reps1$rep_extreme_text,
  metadata = reps1,
  customstopwords = c("republic", "republican's", "republican",
                      "extreme", "extreamen",
                      "republicans", "republicansare",
                      "republicians", "republiconvicts",
                      "repubs", "party", "doesn't", "doesnt", "don't", "dont")
)

# Show how many documents/words/tokens are lost at each threshold 1..10.
# seq() is required here: c(1, 10, by = 1) is just the vector 1, 10, 1.
plotRemoved(processed_R$documents, lower.thresh = seq(1, 10, by = 1))

# Drop terms appearing in 3 or fewer documents. The element is spelled out as
# `documents` rather than relying on `$` partial matching.
out_R <- prepDocuments(processed_R$documents, processed_R$vocab,
                       processed_R$meta, lower.thresh = 3)

# Convenience copies of the prepared corpus pieces.
docs_R <- out_R$documents
vocab_R <- out_R$vocab
meta_R <- out_R$meta
docs_R_removed <- out_R$docs.removed

####Democrats
# Pre-process the responses with stm::textProcessor(), passing the
# target-referencing words identified above as custom stop words
# (the duplicated "extreme" entry has been removed).
processed_D <- textProcessor(
  dems1$dem_extreme_text,
  metadata = dems1,
  customstopwords = c("demacratic", "demacrats", "extreme",
                      "democraft", "democrat", "democrate",
                      "democrates", "democratic", "democrats",
                      "democrsts", "dems", "party",
                      "doesn't", "doesnt", "don't", "dont",
                      "extreamen")
)

# Show how many documents/words/tokens are lost at each threshold 1..10.
# seq() is required here: c(1, 10, by = 1) is just the vector 1, 10, 1.
plotRemoved(processed_D$documents, lower.thresh = seq(1, 10, by = 1))

# Drop terms appearing in 3 or fewer documents. The element is spelled out as
# `documents` rather than relying on `$` partial matching.
out_D <- prepDocuments(processed_D$documents, processed_D$vocab,
                       processed_D$meta, lower.thresh = 3)

# Convenience copies of the prepared corpus pieces.
docs_D <- out_D$documents
vocab_D <- out_D$vocab
meta_D <- out_D$meta
docs_D_removed <- out_D$docs.removed

##############################
#How Many Topics?
##############################
###Republicans
# storage_R <- searchK(out_R$documents, out_R$vocab, K = c(4,5,6,7,8,9,10,11,12,13,14),
#                      prevalence =~ pid_3 + dem_extreme_1 + op_ideol +
#                        knowl_mean + race_eth + gender + age_cat, 
#                      data = out_R$meta, 
#                      seed = 66778)
# 
# plot(storage_R)
# 
# ###Democrats
# storage_D <- searchK(out_D$documents, out_D$vocab, K = c(4,5,6,7,8,9,10,11,12,13,14),
#                      prevalence =~ pid_3 + rep_extreme_1 + op_ideol +
#                        knowl_mean + race_eth + gender + age_cat,
#                      data = out_D$meta, 
#                      seed = 66778)
# 
# plot(storage_D)
# 
# ###Figures
# #Extract the results for the two parties
# rep_fits <- storage_R[["results"]]
# rep_fits$target <- "Republican Party"
# 
# dem_fits <- storage_D[["results"]]
# dem_fits$target <- "Democratic Party"
# 
# #Bind them together
# fits <- rbind(rep_fits, dem_fits)
# 
# #plot them
# fits %>%
#   select(K, semcoh, heldout, residual, lbound, target) %>%
#   gather("fitstat", "outcome", -K, -target) %>%
#   mutate(fitstat = factor(fitstat, 
#                           levels=c("semcoh", 
#                                    "lbound", 
#                                    "residual", 
#                                    "heldout"), 
#                           labels=c("Semantic Coherence", 
#                                    "Lower Bound", 
#                                    "Residuals", 
#                                    "Held Out Likelihood"))) %>%
#   ggplot(aes(x = K, y = outcome ,shape = target, color=target, line = target)) + 
#   geom_point() + 
#   geom_line() + 
#   facet_wrap(~fitstat, scales="free_y") + 
#   theme_minimal() + 
#   labs(y="Fit Statistic", x = "Number Topics") + 
#   theme(strip.text.x = element_text(
#     size=16, face="bold"))
#     
# #save plot
# ggsave("stm_fitstat.png", height=7, width=10)
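# #Based on the above, we chose the number of topics.
# #NOTE(review): this comment originally said six topics, but both models
# #below are fit with K = 7 -- confirm which is intended.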
# 
# ##semantic coherence by exclusivity
# fits %>%
#   select(K, semcoh, exclus, target) %>%
#   ggplot(aes(x=semcoh, y=exclus, shape=target, color=target)) + 
#   geom_point() + 
#   facet_wrap(K~.) + 
#   theme_minimal() + 
#   labs(title="Semantic Coherence by Exclusivity", 
#        subtitle = "Subplots by #Topics",
#        x = "Semantic Coherence", 
#        y = "Exclusivity")
# 
# ggsave("stm_fitstat_1.png", height=7, width=10)


##############################
#The Model
##############################

###Republicans
# Seven-topic STM on the Republican-target responses. Topic prevalence is
# modelled as a function of the respondent covariates; spectral initialisation
# with a fixed seed.
stm_rep <- stm(
  documents = docs_R,
  vocab = vocab_R,
  data = meta_R,
  prevalence = ~ pid_3 + dem_extreme_1 + op_ideol + knowl_mean +
    race_eth + gender + age_cat,
  K = 7,
  init.type = "Spectral",
  seed = 66778
)

###Democrats
# Seven-topic STM on the Democratic-target responses. Topic prevalence is
# modelled as a function of the respondent covariates; spectral initialisation
# with a fixed seed.
stm_dem <- stm(
  documents = docs_D,
  vocab = vocab_D,
  data = meta_D,
  prevalence = ~ pid_3 + rep_extreme_1 + op_ideol + knowl_mean +
    race_eth + gender + age_cat,
  K = 7,
  init.type = "Spectral",
  seed = 66778
)


##############################
#Interpretation
##############################
####Plot of topic prevalence and most probable words
###Tidy'd versions with betas
# Long-format per-topic word probabilities ("beta" is tidy()'s default matrix).
tidy_rep <- tidy(stm_rep, matrix = "beta")
tidy_dem <- tidy(stm_dem, matrix = "beta")

###Gammas, i.e. topic probabilities
# NOTE(review): the original passed rownames(out_*$documents) as
# document_names. The prepared documents are a list, for which rownames() is
# NULL, so that is the same as the explicit NULL used here. If named documents
# were intended, names(out_*$documents) may be what was meant -- confirm.
gamma_R <- tidy(stm_rep, matrix = "gamma", document_names = NULL)

gamma_D <- tidy(stm_dem, matrix = "gamma", document_names = NULL)

####Figures
##Republicans
# Ten highest-beta terms per topic (ties kept), collapsed into one
# comma-separated string per topic, highest beta first. The string is built
# directly in summarise(); the previous list-column + unnest() round trip
# relied on calling unnest() without `cols`, which is deprecated and warns.
top_R <- tidy_rep %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  arrange(desc(beta)) %>%
  summarise(terms = paste(term, collapse = ", "))

# Mean gamma per topic, joined to its top terms. `topic` becomes a factor
# ordered by mean gamma so the bars in the figure are sorted.
gamma_val_R <- gamma_R %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma)) %>%
  left_join(top_R, by = "topic") %>%
  mutate(topic = paste0("Topic ", topic),
         topic = reorder(topic, gamma))

# Printed explicitly so the values also appear when the script is source()'d.
print(gamma_val_R$gamma)

library(scales)
# Horizontal bar chart of mean topic proportion for the Republican-target
# model, with each bar annotated by the topic's top terms.
rfig <- ggplot(gamma_val_R, aes(x = topic, y = gamma, label = terms)) +
  geom_col(show.legend = FALSE) +
  geom_text(hjust = 0, nudge_y = 0.0005, size = 4) +
  coord_flip() +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 0.35),
    labels = percent_format()
  ) +
  theme_minimal() +
  xlab(NULL) +
  ylab(expression(gamma)) +
  ggtitle("Republican Party")

##Democrats
# Ten highest-beta terms per topic (ties kept), collapsed into one
# comma-separated string per topic, highest beta first. The string is built
# directly in summarise(); the previous list-column + unnest() round trip
# relied on calling unnest() without `cols`, which is deprecated and warns.
top_D <- tidy_dem %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  arrange(desc(beta)) %>%
  summarise(terms = paste(term, collapse = ", "))

# Mean gamma per topic, joined to its top terms. `topic` becomes a factor
# ordered by mean gamma so the bars in the figure are sorted.
gamma_val_D <- gamma_D %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma)) %>%
  left_join(top_D, by = "topic") %>%
  mutate(topic = paste0("Topic ", topic),
         topic = reorder(topic, gamma))

# Printed explicitly so the values also appear when the script is source()'d.
print(gamma_val_D$gamma)

# Horizontal bar chart of mean topic proportion for the Democratic-target
# model, with each bar annotated by the topic's top terms.
dfig <- ggplot(gamma_val_D, aes(x = topic, y = gamma, label = terms)) +
  geom_col(show.legend = FALSE) +
  geom_text(hjust = 0, nudge_y = 0.0005, size = 4) +
  coord_flip() +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 0.35),
    labels = percent_format()
  ) +
  theme_minimal() +
  xlab(NULL) +
  ylab(expression(gamma)) +
  ggtitle("Democratic Party")

###Combine
library(ggpubr)
# Stack the two party figures in one column. The arranged figure is kept in a
# variable and handed to ggsave() explicitly, rather than relying on whatever
# ggplot2 currently records as the "last plot".
fig_od1 <- ggarrange(rfig, dfig, ncol = 1)
print(fig_od1)

ggsave("figure_od1.png", plot = fig_od1, height = 7, width = 10)


##########Exemplars: Figures 8 & 9
##Figure 8: Republicans
# For each of the seven topics, pull one exemplar response (n = 1) from the
# Republican-target model with findThoughts().
rep_exemplars <- lapply(1:7, function(k) {
  findThoughts(stm_rep,
               texts = out_R$meta$rep_extreme_text,
               n = 1, topics = k)$docs[[1]]
})
r1 <- rep_exemplars[[1]]
r2 <- rep_exemplars[[2]]
r3 <- rep_exemplars[[3]]
r4 <- rep_exemplars[[4]]
r5 <- rep_exemplars[[5]]
r6 <- rep_exemplars[[6]]
r7 <- rep_exemplars[[7]]

# One quote panel per topic on a 4 x 2 grid, in the panel order fixed by the
# original script (presumably ordered by topic prevalence -- confirm).
png("figure_od2.png", width = 1920, height = 1080)
par(mfrow = c(4, 2), mar = c(5, .5, 3, .5))
for (k in c(7, 6, 1, 2, 5, 3, 4)) {
  plotQuote(rep_exemplars[[k]], width = 75, text.cex = 2.5)
  mtext(paste("Topic", k), cex = 3)
}
dev.off()

#####Democrats: Figure 9
#draw exemplar
# For each of the seven topics, pull one exemplar response (n = 1) from the
# Democratic-target model with findThoughts().
dem_exemplars <- lapply(1:7, function(k) {
  findThoughts(stm_dem,
               texts = out_D$meta$dem_extreme_text,
               n = 1, topics = k)$docs[[1]]
})
d1 <- dem_exemplars[[1]]
d2 <- dem_exemplars[[2]]
d3 <- dem_exemplars[[3]]
d4 <- dem_exemplars[[4]]
d5 <- dem_exemplars[[5]]
d6 <- dem_exemplars[[6]]
d7 <- dem_exemplars[[7]]

#figure
# One quote panel per topic on a 4 x 2 grid, in the panel order fixed by the
# original script (presumably ordered by topic prevalence -- confirm).
png("figure_od3.png", width = 1920, height = 1080)
par(mfrow = c(4, 2), mar = c(5, .5, 3, .5))
for (k in c(6, 3, 2, 5, 7, 4, 1)) {
  plotQuote(dem_exemplars[[k]], width = 75, text.cex = 2.5)
  mtext(paste("Topic", k), cex = 3)
}
dev.off()


########Exemplars: Multiple Per Figure
####Republicans
# Five exemplar responses (n = 5) per topic from the Republican-target model.
rep_multi <- lapply(1:7, function(k) {
  findThoughts(stm_rep,
               texts = out_R$meta$rep_extreme_text,
               n = 5, topics = k)$docs[[1]]
})
r1_5 <- rep_multi[[1]]
r2_5 <- rep_multi[[2]]
r3_5 <- rep_multi[[3]]
r4_5 <- rep_multi[[4]]
r5_5 <- rep_multi[[5]]
r6_5 <- rep_multi[[6]]
r7_5 <- rep_multi[[7]]

# Output file for topics 1..7, in that order.
rep_multi_files <- c("figure_od4.png", "figure_od5.png", "figure_od6.png",
                     "figure_od7.png", "figure_od8.png", "figure_od9.png",
                     "figure_od10.png")

# One PNG per topic, each holding that topic's five quotes.
for (k in seq_along(rep_multi)) {
  png(rep_multi_files[k], width = 1920, height = 1080)
  plotQuote(rep_multi[[k]], width = 75, text.cex = 2.5)
  dev.off()
}

####Democrats
# Five exemplar responses (n = 5) per topic from the Democratic-target model.
dem_multi <- lapply(1:7, function(k) {
  findThoughts(stm_dem,
               texts = out_D$meta$dem_extreme_text,
               n = 5, topics = k)$docs[[1]]
})
d1_5 <- dem_multi[[1]]
d2_5 <- dem_multi[[2]]
d3_5 <- dem_multi[[3]]
d4_5 <- dem_multi[[4]]
d5_5 <- dem_multi[[5]]
d6_5 <- dem_multi[[6]]
d7_5 <- dem_multi[[7]]

# Output file for topics 1..7, in that order.
dem_multi_files <- c("figure_od11.png", "figure_od12.png", "figure_od13.png",
                     "figure_od14.png", "figure_od15.png", "figure_od16.png",
                     "figure_od17.png")

# Text size per topic: topic 1 is drawn at 2 and the rest at 2.5, exactly as
# in the original script (presumably so the topic 1 quotes fit -- confirm).
dem_multi_cex <- c(2, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5)

# One PNG per topic, each holding that topic's five quotes.
for (k in seq_along(dem_multi)) {
  png(dem_multi_files[k], width = 1920, height = 1080)
  plotQuote(dem_multi[[k]], width = 75, text.cex = dem_multi_cex[k])
  dev.off()
}
